{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 协同过滤"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "说明：1、打分采用的是 ((interested - not_interested)+1)*0.5，即分值有三种：（1，0）对应 1，表示感兴趣；（0，1）对应 0，表示不感兴趣；（0，0）和（1，1）对应 0.5，表示不确定（可能是没填或者填错）。2、只实现了协同过滤部分的函数，其它部分沿用老师的代码。\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import numpy as np\n",
    "import scipy.io as sio\n",
    "import scipy.sparse as ss\n",
    "from numpy.random import random  \n",
    "from collections import defaultdict\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "class RecommonderSystem:\n",
    "    def __init__(self):\n",
    "        \"\"\"Load the precomputed artefacts from the working directory and train the SVD model.\n",
    "\n",
    "        The index/lookup structures are read from pickle files and the matrices from\n",
    "        Matrix Market files; all file names are fixed and resolved against the current\n",
    "        working directory. ``init_SVD`` and ``train_SVD`` run as part of construction.\n",
    "\n",
    "        NOTE(review): ``pickle.load`` can execute arbitrary code, so only point this at\n",
    "        ``.pkl`` files produced by your own preprocessing.\n",
    "        \"\"\"\n",
    "        # User-id -> row index and event-id -> column index lookups\n",
    "        # (per the original note they cover both the train and the test set).\n",
    "        with open(\"PE_userIndex.pkl\", 'rb') as fh:\n",
    "            self.userIndex = pickle.load(fh)\n",
    "        with open(\"PE_eventIndex.pkl\", 'rb') as fh:\n",
    "            self.eventIndex = pickle.load(fh)\n",
    "        # Number of users / number of events\n",
    "        self.n_users = len(self.userIndex)\n",
    "        self.n_events = len(self.eventIndex)\n",
    "        # User x event score matrix, densified (per the original note it holds the\n",
    "        # sparse ratings taken from the training data).\n",
    "        self.userEventScores = sio.mmread(\"PE_userEventScores\").todense()\n",
    "        # Events recorded for each user index, and users recorded for each event index\n",
    "        with open(\"PE_eventsForUser.pkl\", 'rb') as fh:\n",
    "            self.eventsForUser = pickle.load(fh)\n",
    "        with open(\"PE_usersForEvent.pkl\", 'rb') as fh:\n",
    "            self.usersForEvent = pickle.load(fh)\n",
    "\n",
    "        # Model-based collaborative filtering: initialise the parameters, then train\n",
    "        self.init_SVD()\n",
    "        self.train_SVD(trainfile = \"train.csv\")\n",
    "\n",
    "        # User-user similarity computed from user attributes\n",
    "        self.userSimMatrix = sio.mmread(\"US_userSimMatrix\").todense()\n",
    "\n",
    "        # Event-event similarities computed from event attributes\n",
    "        self.eventPropSim = sio.mmread(\"EV_eventPropSim\").todense()\n",
    "        self.eventContSim = sio.mmread(\"EV_eventContSim\").todense()\n",
    "\n",
    "        # Number of friends of every user.\n",
    "        # NOTE(review): unlike the other matrices this one is not densified, yet\n",
    "        # userPop indexes it as [0, i]; that only works if mmread returns a dense\n",
    "        # array for this file -- confirm against the file format.\n",
    "        self.numFriends = sio.mmread(\"UF_numFriends\")\n",
    "\n",
    "        # Influence of each friend's event scores on the user\n",
    "        self.userFriends = sio.mmread(\"UF_userFriends\").todense()\n",
    "\n",
    "        # Popularity of each event\n",
    "        self.eventPopularity = sio.mmread(\"EA_eventPopularity\").todense()\n",
    "        \n",
    "        \n",
    "        \n",
    "    def init_SVD(self, K=20):\n",
    "        #SVD初始化，将老师代码中的self.bu进行了转置，便于后续计算\n",
    "        self.K = K          \n",
    "        self.bi = np.mat(np.zeros(self.n_events))  \n",
    "        self.bu = np.mat((np.zeros(self.n_users))).T    \n",
    "        self.P = random((self.n_users,self.K))/10*(np.sqrt(self.K))\n",
    "        self.Q = random((self.K, self.n_events))/10*(np.sqrt(self.K))     \n",
    "        \n",
    "    def train_SVD(self,trainfile = 'train.csv', steps=500,gamma=0.04,Lambda=0.15):\n",
    "        #steps为迭代次数，gamma为迭代因子，Lambda为正则参数\n",
    "        \n",
    "        #计算所有样本的均值\n",
    "        self.mu=0\n",
    "        num=0        \n",
    "        \n",
    "        ftrain = open(trainfile,'r')\n",
    "        ftrain.readline()       \n",
    "        \n",
    "        #初始化位置矩阵，train中样本对应位置为1，其他位置为0\n",
    "        matrix_train=np.zeros(self.userEventScores.shape)        \n",
    "        \n",
    "        for line in ftrain:\n",
    "            cols = line.strip().split(\",\")\n",
    "            i = self.userIndex[cols[0]]  \n",
    "            j = self.eventIndex[cols[1]] \n",
    "            #对存在样本的矩阵元素赋值为1\n",
    "            matrix_train[i,j]=1\n",
    "            self.mu=self.mu+self.userEventScores[i,j]\n",
    "            num=num+1\n",
    "        ftrain.close()\n",
    "        #求均值\n",
    "        self.mu=self.mu/num\n",
    "        \n",
    "        #初始化SSE\n",
    "        self.SSEscore=[]\n",
    "        #采用批量随机梯度下降，每次随机对1/piece个样本进行计算\n",
    "        self.piece=100\n",
    "        for step in range(steps):  \n",
    "            #随机位置矩阵，随机1/piece个样本的位置为1，其它位置为0\n",
    "            matrix_piece=np.int64(np.multiply(matrix_train,random(matrix_train.shape))>(1-(1/self.piece)))\n",
    "            #预测矩阵，对R预测\n",
    "            Rpred=self.mu+(self.P).dot(self.Q)\n",
    "            Rpred=Rpred+self.bi\n",
    "            Rpred=Rpred+self.bu\n",
    "            #预测的误差矩阵\n",
    "            Eui=self.userEventScores-np.multiply(Rpred,matrix_train)\n",
    "            #1/piece个样本的误差矩阵，全部样本会产生梯度爆炸\n",
    "            Eui_piece=np.multiply(Eui,matrix_piece)\n",
    "            \n",
    "            #对全部样本计算SSE\n",
    "            SSE_Eui=0.5*np.sum(np.multiply(Eui,Eui))\n",
    "            SSE_P=0.5*Lambda*np.sum(np.multiply(self.P,self.P))\n",
    "            SSE_Q=0.5*Lambda*np.sum(np.multiply(self.Q,self.Q))\n",
    "            SSE_bu=0.5*Lambda*np.sum(np.multiply(self.bu,self.bu))\n",
    "            SSE_bi=0.5*Lambda*np.sum(np.multiply(self.bi,self.bi))\n",
    "            SSE=SSE_Eui+SSE_P+SSE_Q+SSE_bu+SSE_bi\n",
    "            self.SSEscore.append(SSE)\n",
    "            \n",
    "            #利用1/piece个样本的误差矩阵计算梯度\n",
    "            gradP=-Eui_piece.dot((self.Q).T)+Lambda*(self.P)\n",
    "            gradQ=-((self.P).T).dot(Eui_piece)+Lambda*(self.Q)\n",
    "            gradbu=-Eui_piece.sum(axis=1)+Lambda*(self.bu)\n",
    "            gradbi=-Eui_piece.sum(axis=0)+Lambda*(self.bi)\n",
    "            #迭代\n",
    "            self.P=self.P-gamma*gradP\n",
    "            self.Q=self.Q-gamma*gradQ\n",
    "            self.bu=self.bu-gamma*gradbu\n",
    "            self.bi=self.bi-gamma*gradbi\n",
    "            \n",
    "            #print (SSE)        \n",
    "    def pred_SVD(self, uid, i_id):\n",
    "        #根据当前参数，预测用户uid对Item（i_id）的打分    \n",
    "        ans=self.mu + self.bi[0,i_id] + self.bu[uid] + np.dot(self.P[uid,:],self.Q[:,i_id])  \n",
    "        #将打分范围控制在0-1之间\n",
    "        if ans>1:  \n",
    "            return 1  \n",
    "        elif ans<0:  \n",
    "            return 0\n",
    "        return ans  \n",
    "        return 0\n",
    "    def svdCFReco(self, userId, eventId):\n",
    "        #基于模型的协同过滤, SVD++/LFM\n",
    "        u = self.userIndex[userId]\n",
    "        i = self.eventIndex[eventId]\n",
    "        return self.pred_SVD(u,i)\n",
    "\n",
    "    def sim_cal_UserCF(self, uid1, uid2 ):\n",
    "        u1 = self.userIndex[uid1]\n",
    "        u2 = self.userIndex[uid2]\n",
    "        event1=self.eventsForUser[u1]            #对用户u1存在打分的活动event1\n",
    "        event2=self.eventsForUser[u2]            #对用户u2存在打分的活动event2\n",
    "        event_both=event1&event2                 \n",
    "        #两个用户共同打分的活动，活动数13418，train中数目15398，估计交叉活动很少，效果不会很好        \n",
    "        mu1=0\n",
    "        mu2=0        \n",
    "        if len(event_both)==0:\n",
    "            return 0\n",
    "        else:\n",
    "            for i in event1:\n",
    "                mu1=mu1+self.userEventScores[u1,i]\n",
    "            for i in event2:\n",
    "                mu2=mu2+self.userEventScores[u2,i]\n",
    "            mu1=mu1/len(event1)                 #用户1的平均打分\n",
    "            mu2=mu2/len(event2)                 #用户2的平均打分\n",
    "\n",
    "            #在共同活动上的打分向量（去均值）\n",
    "            score1=self.userEventScores[u1,[i for i in event_both]]-mu1                 \n",
    "            score2=self.userEventScores[u2,[i for i in event_both]]-mu2\n",
    "            num=score1*(score2.T)         \n",
    "            den=np.sqrt(np.sum(np.multiply(score1,score1))*np.sum(np.multiply(score2,score2))) \n",
    "            if den!=0:            \n",
    "                similarity=num/den   \n",
    "            else:   \n",
    "                #此时分母为0，有一个用户分数不变，相关系数不能用了，用两个用户相同分数的个数除以分数的个数作为相似度                           \n",
    "                number_both=len(score1)\n",
    "                number_same=(score1==score2).sum()\n",
    "                similarity=(number_same/number_both)*2-1 #归一化到[-1，1]\n",
    "            return similarity      \n",
    "    def userCFReco(self, userId, eventId):\n",
    "        users= self.usersForEvent[self.eventIndex[eventId]]\n",
    "        if len(users)==0:\n",
    "            return 0\n",
    "        else:\n",
    "            #计算用户自身对不同活动的打分均值,后面计算用\n",
    "            u = self.userIndex[userId]     \n",
    "            event=self.eventsForUser[u]         \n",
    "            mu=0\n",
    "            if len(event)==0:\n",
    "                mu=0\n",
    "            else:\n",
    "                for i in event:\n",
    "                    mu=mu+self.userEventScores[u,i]\n",
    "                mu=mu/len(event)    \n",
    "            \n",
    "            #计算相似度和去均值的打分偏置\n",
    "            sim=np.zeros(len(users))\n",
    "            score=np.zeros(len(users))\n",
    "            \n",
    "            for m,n in enumerate(users): \n",
    "                otherId=self.get_keys(self.userIndex,n)\n",
    "                sim[m]=self.sim_cal_UserCF(userId,otherId)                \n",
    "                score[m]=self.userEventScores[n,self.eventIndex[eventId]]\n",
    "                #求每个相似用户的均值\n",
    "                mu_n=0\n",
    "                event_n=self.eventsForUser[n]                 \n",
    "                for i in event_n:\n",
    "                      mu_n=mu_n+self.userEventScores[n,i]\n",
    "                mu_n=mu_n/len(event_n)\n",
    "                score[m]=score[m]-mu_n      #打分去均值\n",
    "            x=(sim>0)\n",
    "            sim=sim[x]\n",
    "            score=score[x]\n",
    "            if len(sim)==0:                 #相似度大于0的用户个数\n",
    "                return mu\n",
    "            else:\n",
    "                return ((sim.dot(score))/sim.sum())+mu                \n",
    "    def sim_cal_EventCF(self, e_id1, e_id2):\n",
    "        e1 = self.eventIndex[e_id1]\n",
    "        e2 = self.eventIndex[e_id2]\n",
    "        user1=self.usersForEvent[e1]            #对用户u1存在打分的活动event1\n",
    "        user2=self.usersForEvent[e2]            #对用户u2存在打分的活动event2\n",
    "        user_both=user1&user2                 \n",
    "        #两个用户共同打分的活动，活动数13418，train中数目15398，估计交叉活动很少，效果不会很好        \n",
    "        mu1=0\n",
    "        mu2=0        \n",
    "        if len(user_both)==0:\n",
    "            return 0\n",
    "        else:\n",
    "            for i in user1:\n",
    "                mu1=mu1+self.userEventScores[i,e1]\n",
    "            for i in user2:\n",
    "                mu2=mu2+self.userEventScores[i,e2]\n",
    "            mu1=mu1/len(user1)                 #用户1的平均打分\n",
    "            mu2=mu2/len(user2)                 #用户2的平均打分\n",
    "\n",
    "            #在共同活动上的打分向量（去均值）\n",
    "            score1=self.userEventScores[[i for i in user_both],e1]-mu1                 \n",
    "            score2=self.userEventScores[[i for i in user_both],e2]-mu2\n",
    "            num=(score1.T)*score2         \n",
    "            den=np.sqrt(np.sum(np.multiply(score1,score1))*np.sum(np.multiply(score2,score2))) \n",
    "            if den!=0:            \n",
    "                similarity=num/den   \n",
    "            else:   \n",
    "                #此时分母为0，有一个用户分数不变，相关系数不能用了，用两个用户相同分数的个数除以分数的个数作为相似度                           \n",
    "                number_both=len(score1)\n",
    "                number_same=(score1==score2).sum()\n",
    "                similarity=(number_same/number_both)*2-1 #归一化到[-1，1]\n",
    "            return similarity      \n",
    "    def eventCFReco(self, userId, eventId):  \n",
    "        events= self.eventsForUser[self.userIndex[userId]]\n",
    "        if len(events)==0:\n",
    "            return 0\n",
    "        else:\n",
    "            #计算用户自身对不同活动的打分均值,后面计算用\n",
    "            e = self.eventIndex[eventId]     \n",
    "            user=self.usersForEvent[e]         \n",
    "            mu=0\n",
    "            if len(user)==0:\n",
    "                mu=0\n",
    "            else:\n",
    "                for i in user:\n",
    "                    mu=mu+self.userEventScores[i,e]\n",
    "                mu=mu/len(user)    \n",
    "            \n",
    "            #计算相似度和去均值的打分偏置\n",
    "            sim=np.zeros(len(events))\n",
    "            score=np.zeros(len(events))\n",
    "            \n",
    "            for m,n in enumerate(events): \n",
    "                otherId=self.get_keys(self.eventIndex,n)\n",
    "                sim[m]=self.sim_cal_EventCF(eventId,otherId)                \n",
    "                score[m]=self.userEventScores[self.userIndex[userId],n]\n",
    "                #求每个相似用户的均值\n",
    "                mu_n=0\n",
    "                user_n=self.usersForEvent[n]                 \n",
    "                for i in user_n:\n",
    "                      mu_n=mu_n+self.userEventScores[i,n]\n",
    "                mu_n=mu_n/len(user_n)\n",
    "                score[m]=score[m]-mu_n      #打分去均值\n",
    "            x=(sim>0)\n",
    "            sim=sim[x]\n",
    "            score=score[x]\n",
    "            if len(sim)==0:                 #相似度大于0的用户个数\n",
    "                return mu\n",
    "            else:\n",
    "                return ((sim.dot(score))/sim.sum())+mu                 \n",
    "    def userReco(self, userId, eventId):\n",
    "        #user的协同过滤，相似度基于user本身属性计算\n",
    "        i = self.userIndex[userId]\n",
    "        j = self.eventIndex[eventId]\n",
    "        \n",
    "        vs = self.userEventScores[:, j]\n",
    "        \n",
    "        sims = self.userSimMatrix[i, :]\n",
    "        prod = sims * vs\n",
    "        try:\n",
    "            return (prod[0, 0] - self.userEventScores[i, j])                #/(sims.sum()-1)\n",
    "        except IndexError:\n",
    "            return 0\n",
    "        \n",
    "    def eventReco(self, userId, eventId):\n",
    "        i = self.userIndex[userId]\n",
    "        j = self.eventIndex[eventId]\n",
    "        js = self.userEventScores[i, :]\n",
    "        psim = self.eventPropSim[:, j]\n",
    "        csim = self.eventContSim[:, j]\n",
    "        pprod = js * psim\n",
    "        cprod = js * csim  \n",
    "        pscore = 0\n",
    "        cscore = 0\n",
    "        try:\n",
    "            pscore = (pprod[0, 0] - self.userEventScores[i, j])             #/(psim.sum()-1)\n",
    "        except IndexError:\n",
    "            pass\n",
    "        try:\n",
    "            cscore = (cprod[0, 0] - self.userEventScores[i, j])             #/(csim.sum()-1)\n",
    "        except IndexError:\n",
    "            pass\n",
    "        return pscore, cscore           \n",
    "    def userPop(self, userId):  \n",
    "        #根据用户的朋友数量预测，返回值是归一化的朋友数量\n",
    "        if userId in self.userIndex:\n",
    "            i = self.userIndex[userId]\n",
    "            try:\n",
    "                return self.numFriends[0,i]\n",
    "            except IndexError:\n",
    "                return 0             \n",
    "        else:\n",
    "            return 0\n",
    "    def friendInfluence(self, userId):    \n",
    "        #朋友对用户的影响，朋友喜欢社交活动对当前用户有一定的影响\n",
    "        #函数返回值为该用户所有朋友参加活动平均频率的平均值\n",
    "        nusers = np.shape(self.userFriends)[1]\n",
    "        i = self.userIndex[userId]\n",
    "        return (self.userFriends[i, :].sum(axis=1) / nusers)[0,0]   \n",
    "    def eventPop(self, eventId):\n",
    "        #统计活动参加和不参加的人数，计算活动热度\n",
    "        i = self.eventIndex[eventId]\n",
    "        return self.eventPopularity[i, 0]\n",
    "    def get_keys(self,d,value):  \n",
    "        for k,v in d.items():\n",
    "            if v==value:\n",
    "                key=k\n",
    "        return key  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generateRSData(RS, train=True, header=True):\n",
    "    \n",
    "    #特征组合,生成新的训练数据，用于分类器分类使用 \n",
    "    fn = \"train.csv\" if train else \"test.csv\"\n",
    "    fin = open(fn, 'r')\n",
    "    fout = open(\"RS_\" + fn, 'wb')\n",
    "    \n",
    "    #忽略第一行（列名字）\n",
    "    fin.readline().strip().split(\",\")\n",
    "    \n",
    "    # write output header\n",
    "    if header:\n",
    "        ocolnames = [\"invited\", \"userCF_reco\", \"evtCF_reco\",\"svdCF_reco\",\"user_reco\", \"evt_p_reco\",\\\n",
    "                     \"evt_c_reco\", \"user_pop\", \"frnd_infl\", \"evt_pop\"]\n",
    "    if train:\n",
    "        ocolnames.append(\"interested\")\n",
    "        ocolnames.append(\"not_interested\")\n",
    "    fout.write((\",\".join(ocolnames) + \"\\n\").encode(encoding='utf-8'))\n",
    "    \n",
    "    ln = 0\n",
    "    for line in fin:\n",
    "        ln += 1\n",
    "        if ln%500 == 0:\n",
    "            print (\"%s:%d (userId, eventId)=(%s, %s)\"% (fn, ln, userId, eventId)) \n",
    "            #break;    \n",
    "        cols = line.strip().split(\",\")\n",
    "        userId = cols[0]\n",
    "        eventId = cols[1]\n",
    "        invited = cols[2]    \n",
    "        \n",
    "        userCF_reco = RS.userCFReco(userId, eventId)\n",
    "        eventCF_reco =RS.eventCFReco(userId, eventId)\n",
    "        svdCF_reco =RS.svdCFReco(userId, eventId)\n",
    "                        \n",
    "        \n",
    "        user_reco =RS.userReco(userId, eventId)\n",
    "        evt_p_reco, evt_c_reco =RS.eventReco(userId, eventId)\n",
    "        user_pop = RS.userPop(userId)    \n",
    "        \n",
    "        frnd_infl = RS.friendInfluence(userId)\n",
    "        evt_pop = RS.eventPop(eventId)\n",
    "        ocols = [invited, userCF_reco, eventCF_reco, svdCF_reco,user_reco, evt_p_reco,\n",
    "        evt_c_reco, user_pop, frnd_infl, evt_pop]     \n",
    "        if train:\n",
    "            ocols.append(cols[4]) # interested\n",
    "            ocols.append(cols[5]) # not_interested\n",
    "        fout.write((\",\".join(map(lambda x: str(x), ocols))+ \"\\n\").encode(encoding='utf-8'))   \n",
    "    fin.close()\n",
    "    fout.close()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "生成训练数据...\n",
      "\n",
      "train.csv:500 (userId, eventId)=(123290209, 1887085024)\n",
      "train.csv:1000 (userId, eventId)=(272886293, 199858305)\n",
      "train.csv:1500 (userId, eventId)=(395305791, 1582270949)\n",
      "train.csv:2000 (userId, eventId)=(527523423, 3272728211)\n",
      "train.csv:2500 (userId, eventId)=(651258472, 792632006)\n",
      "train.csv:3000 (userId, eventId)=(811791433, 524756826)\n",
      "train.csv:3500 (userId, eventId)=(985547042, 1269035551)\n",
      "train.csv:4000 (userId, eventId)=(1107615001, 173949238)\n",
      "train.csv:4500 (userId, eventId)=(1236336671, 3849306291)\n",
      "train.csv:5000 (userId, eventId)=(1414301782, 2652356640)\n",
      "train.csv:5500 (userId, eventId)=(1595465532, 955398943)\n",
      "train.csv:6000 (userId, eventId)=(1747091728, 2131379889)\n",
      "train.csv:6500 (userId, eventId)=(1914182220, 955398943)\n",
      "train.csv:7000 (userId, eventId)=(2071842684, 1076364848)\n",
      "train.csv:7500 (userId, eventId)=(2217853337, 3051438735)\n",
      "train.csv:8000 (userId, eventId)=(2338481531, 2525447278)\n",
      "train.csv:8500 (userId, eventId)=(2489551967, 520657921)\n",
      "train.csv:9000 (userId, eventId)=(2650493630, 87962584)\n",
      "train.csv:9500 (userId, eventId)=(2791418962, 4223848259)\n",
      "train.csv:10000 (userId, eventId)=(2903662804, 2791462807)\n",
      "train.csv:10500 (userId, eventId)=(3036141956, 3929507420)\n",
      "train.csv:11000 (userId, eventId)=(3176074542, 3459485614)\n",
      "train.csv:11500 (userId, eventId)=(3285425249, 2271782630)\n",
      "train.csv:12000 (userId, eventId)=(3410667855, 1063772489)\n",
      "train.csv:12500 (userId, eventId)=(3531604778, 2584839423)\n",
      "train.csv:13000 (userId, eventId)=(3686871863, 53495098)\n",
      "train.csv:13500 (userId, eventId)=(3833637800, 2415873572)\n",
      "train.csv:14000 (userId, eventId)=(3944021305, 2096772901)\n",
      "train.csv:14500 (userId, eventId)=(4075466480, 3567240505)\n",
      "train.csv:15000 (userId, eventId)=(4197193550, 1628057176)\n",
      "生成预测数据...\n",
      "\n",
      "test.csv:500 (userId, eventId)=(182290053, 2529072432)\n",
      "test.csv:1000 (userId, eventId)=(433510318, 4244463632)\n",
      "test.csv:1500 (userId, eventId)=(632808865, 2845303452)\n",
      "test.csv:2000 (userId, eventId)=(813611885, 2036538169)\n",
      "test.csv:2500 (userId, eventId)=(1010701404, 303459881)\n",
      "test.csv:3000 (userId, eventId)=(1210932037, 2529072432)\n",
      "test.csv:3500 (userId, eventId)=(1452921099, 2705317682)\n",
      "test.csv:4000 (userId, eventId)=(1623287180, 1626678328)\n",
      "test.csv:4500 (userId, eventId)=(1855201342, 2603032829)\n",
      "test.csv:5000 (userId, eventId)=(2083900381, 2529072432)\n",
      "test.csv:5500 (userId, eventId)=(2318415276, 2509151803)\n",
      "test.csv:6000 (userId, eventId)=(2528161539, 4025975316)\n",
      "test.csv:6500 (userId, eventId)=(2749110768, 4244406355)\n",
      "test.csv:7000 (userId, eventId)=(2927772127, 1532377761)\n",
      "test.csv:7500 (userId, eventId)=(3199685636, 1776393554)\n",
      "test.csv:8000 (userId, eventId)=(3393388475, 680270887)\n",
      "test.csv:8500 (userId, eventId)=(3601169721, 154434302)\n",
      "test.csv:9000 (userId, eventId)=(3828963415, 3067222491)\n",
      "test.csv:9500 (userId, eventId)=(4018723397, 2522610844)\n",
      "test.csv:10000 (userId, eventId)=(4180064266, 2658555390)\n"
     ]
    }
   ],
   "source": [
    "# Build the recommender once (loads the artefacts and trains the SVD model),\n",
    "# then derive the feature files for the training and the test set.\n",
    "RS = RecommonderSystem()\n",
    "\n",
    "print(\"生成训练数据...\\n\")\n",
    "generateRSData(RS, train=True, header=True)\n",
    "\n",
    "print(\"生成预测数据...\\n\")\n",
    "generateRSData(RS, train=False, header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0,0.5,'SSE')"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZUAAAEKCAYAAADaa8itAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzt3XmUnHWd7/H3t6p676SXpLPQ6SyagECEBDIBheMCCgG9Bh24F0YlKmcyc2Uc5+odB86dkRmVe3XGGZSZK0cUruj1gojMsLhADCiuQIc1C6TDkhCzddKdztJJb/W9f9Svk0qneq+qp6vr8zqnTj3P7/k91d9faPLJ79nK3B0REZFsiEVdgIiITB4KFRERyRqFioiIZI1CRUREskahIiIiWaNQERGRrFGoiIhI1ihUREQkaxQqIiKSNYmoC8i36dOn+/z586MuQ0SkYKxbt26vuzeMpG/Rhcr8+fNpbm6OugwRkYJhZltH2leHv0REJGsUKiIikjU5CxUzu9PM9pjZ+rS2ejNbY2Yt4b0utJuZ3WpmW8zsBTM7J22fVaF/i5mtSms/18xeDPvcamaWq7GIiMjI5HKm8h1gxYC2G4C17r4IWBvWAS4DFoXXauA2SIUQcBNwHrAcuKk/iEKf1Wn7DfxZIiKSZzkLFXd/Amgb0LwSuCss3wVckdb+XU/5PVBrZrOBS4E17t7m7u3AGmBF2DbV3X/nqS+E+W7aZ4mISETyfU5lprvvBAjvM0J7I/BGWr/toW2o9u0Z2kVEJEIT5UR9pvMhPob2zB9uttrMms2subW1dYwliojIcPIdKrvDoSvC+57Qvh1oSus3B9gxTPucDO0Zufvt7r7M3Zc1NIzo/p2T3Lq2hV9uViCJiAwl36HyINB/Bdcq4IG09mvDVWDnAx3h8NgjwCVmVhdO0F8CPBK2HTSz88NVX9emfVZOfPOXr/DLlxUqIiJDydkd9WZ2N/AuYLqZbSd1FdeXgXvN7DpgG3BV6P4T4HJgC9AJfBzA3dvM7IvA06HfF9y9/+T/fyV1hVkF8NPwypnq8gSHu3pz+SNERApezkLF3a8ZZNPFGfo6cP0gn3MncGeG9mZg8XhqHI2qsgSHFCoiIkOaKCfqJ7wpChURkWEpVEZIMxURkeEpVEaoukznVEREhqNQGaFqzVRERIalUBmh6nKFiojIcBQqI1QVDn+lLlQTEZFMFCojVF2WoKfP6epNRl2KiMiEpVAZoeqy1C09OlkvIjI4hcoI9YeKzquIiAxOoTJCNRUlAHQc6Ym4EhGRiUuhMkJ1ValQaTvcHXElIiITl0JlhGorSwHY36mZiojIYBQqI1QXQqW9UzMVEZHBKFRGqKaiBDNo10xFRGRQCpURiseMmooS9mumIiIyKIXKKNRVlupEvYjIEBQqo1BbWaIT9SIiQ1CojEJdZalO1IuIDEGhMgqaqYiIDE2hMgr1OqciIjIkhcoo1FWVcqSnj6M9fVGXIiIyISlURqG2MvWoFh0CExHJTKEyCrqrXkRkaAqVUeifqShUREQyU6iMQn1VmKkc1uEvEZFMFCqjoMNfIiJDU6iMwvET9QoVEZFMFCqjUJaIU1ka15OKRUQGoVAZJT2qRURkcAqVUaqrKqFdd9WLiGSkUBml1ExFh79ERDJRqIxSbWWpTtSLiAxCoTJKdZUlmqmIiAwiklAxs/9mZhvMbL2Z3W1m5Wa2wMyeNLMWM/uBmZWGvmVhfUvYPj/tc24M7S+b2aX5qL2uspSOIz309CXz8eNERApK3kPFzBqBvwSWuftiIA5cDXwFuMXdFwHtwHVhl+uAdndfCNwS+mFmZ4T9zgRWAN8ws3iu658+pQxAJ+tFRDKI6vBXAqgwswRQCewELgLuC9vvAq4IyyvDOmH7xWZmof0ed+9y99eALcDyXBfeUJ26q771UFeuf5SISMHJe6i4+x+ArwLbSIVJB7AO
2O/uvaHbdqAxLDcCb4R9e0P/aentGfY5gZmtNrNmM2tubW0dV/3TqlMzlb2HNFMRERkoisNfdaRmGQuAU4Aq4LIMXb1/l0G2DdZ+cqP77e6+zN2XNTQ0jL7oNNNDqOzTTEVE5CRRHP56D/Cau7e6ew9wP/B2oDYcDgOYA+wIy9uBJoCwvQZoS2/PsE/OTA+Hv/YqVEREThJFqGwDzjezynBu5GJgI/A4cGXoswp4ICw/GNYJ2x9zdw/tV4erwxYAi4Cncl18dVmC0kRMh79ERDJIDN8lu9z9STO7D3gG6AWeBW4HfgzcY2ZfCm13hF3uAL5nZltIzVCuDp+zwczuJRVIvcD17p7zL483MxqqyzRTERHJIO+hAuDuNwE3DWh+lQxXb7n7UeCqQT7nZuDmrBc4jOnVpZqpiIhkoDvqx2B6dRl7D2qmIiIykEJlDKZVl7LvsEJFRGQghcoYTK8uY9+hbpLJjFcwi4gULYXKGEyvLqM36XQc0YMlRUTSKVTGYJruVRERyUihMgYzp5YDsEcn60VETqBQGYPZNalQ2dlxNOJKREQmFoXKGPTPVHZ1HIm4EhGRiUWhMgblJXHqq0o1UxERGUChMkazppazS6EiInIChcoYza4p10xFRGQAhcoYzaopZ9cBhYqISDqFyhjNrimn7XA3R3ty/mBkEZGCoVAZo1k1FQA6ryIikkahMkan6F4VEZGTKFTGaFYIlV0HdK+KiEg/hcoYzdJMRUTkJAqVMaosTVBTUaJzKiIiaRQq46B7VURETqRQGYdZNbqrXkQknUJlHDRTERE5kUJlHGZNrWDvoS66enUDpIgIKFTGpak+dQPkH9p1WbGICChUxmVufSUAW9s6I65ERGRiUKiMQ3+ovKFQEREBFCrj0jCljPKSGFv3KVREREChMi5mxtz6SrZppiIiAihUxm1ufZUOf4mIBAqVceqfqbh71KWIiEROoTJOc+sr6OzuY++h7qhLERGJnEJlnOZNqwJgW9vhiCsREYmeQmWcmsJlxTpZLyISUaiYWa2Z3WdmL5nZJjN7m5nVm9kaM2sJ73Whr5nZrWa2xcxeMLNz0j5nVejfYmarohjLnLoKzGDbPt1VLyIS1Uzl68DP3P0twNnAJuAGYK27LwLWhnWAy4BF4bUauA3AzOqBm4DzgOXATf1BlE/lJXFmTS1nqw5/iYjkP1TMbCrwDuAOAHfvdvf9wErgrtDtLuCKsLwS+K6n/B6oNbPZwKXAGndvc/d2YA2wIo9DOaapvlKXFYuIEM1M5U1AK/B/zOxZM/u2mVUBM919J0B4nxH6NwJvpO2/PbQN1p538+ordVe9iAjRhEoCOAe4zd2XAoc5fqgrE8vQ5kO0n/wBZqvNrNnMmltbW0db77Dm1ley52AXR7r1CHwRKW5RhMp2YLu7PxnW7yMVMrvDYS3C+560/k1p+88BdgzRfhJ3v93dl7n7soaGhqwNpN/caakrwLa3a7YiIsUt76Hi7ruAN8zstNB0MbAReBDov4JrFfBAWH4QuDZcBXY+0BEOjz0CXGJmdeEE/SWhLe+OPQJfh8BEpMglIvq5nwK+b2alwKvAx0kF3L1mdh2wDbgq9P0JcDmwBegMfXH3NjP7IvB06PcFd2/L3xCO0/eqiIikRBIq7v4csCzDposz9HXg+kE+507gzuxWN3r1VaXUVJTwSuuhqEsREYmU7qjPAjPj1JnVtOw+GHUpIiKRUqhkycIZU9i8+5CeViwiRU2hkiWnzqym40gPrYe6oi5FRCQyCpUsOXXmFABaduu8iogUL4VKliyaUQ2g8yoiUtQUKlnSMKWMmooSNu/RTEVEiteQoRIe/jjYtrnZL6dw9V8BtkWHv0SkiA03U/lF/4KZrR2w7T+yXk2BWzhjCpv3HNQVYCJStIYLlfSHNtYPsU1IXQG2v1NXgIlI8RouVHyQ5UzrRa//CjAdAhORYjXcY1pmmNlnSM1K+pcJ69l/3G+B678CbPPug7x9
4fSIqxERyb/hQuVbwJQMywDfzklFBaz/CrAWXQEmIkVqyFBx93/IVyGTwfFngClURKQ4DXdJ8Z+a2aKwbGZ2p5l1mNkLZrY0PyUWFl0BJiLFbLgT9Z8GXg/L1wBnk/qO+c8At+aurMLVfwXY3kPdUZciIpJ3w4VKr7v3hOX3A991933u/nOgKrelFabjzwDT41pEpPgMFypJM5ttZuWkvkDr52nbKnJXVuFKvwJMRKTYDHf11+eBZiAOPOjuGwDM7J2kvgZYBmiYUkZ9VSmbdipURKT4DBcqu4G3AQfdvd3MrgX+OLSvznVxhcjMeGtjDc9v3x91KSIieTfc4a9vAodCoLwD+DLwXVKh8vVcF1eozp5TQ8ueQxzp7ou6FBGRvBouVOLu3haW/wtwu7v/yN3/DliY29IK11lzaulLOht2dERdiohIXg0bKmbWf4jsYuCxtG3DHTorWmfNqQHg+e0KFREpLsMFw93AL81sL3AE+BWAmS0E9DfmIGZMLWfW1HJe0HkVESkywz2m5ebwPSqzgUf9+G3iMeBTuS6ukJ01p4YXNFMRkSIz7CEsd/99hrbNuSln8ji7qZZHN+6m40gPNRUlUZcjIpIX+o76HOk/r/KiZisiUkQUKjlyVmMtgO5XEZGiolDJkZrKEuZPq9TJehEpKgqVHDprTq1O1otIUVGo5NDSubXs7DjK9vbOqEsREckLhUoOLV9QD8DTr7cN01NEZHJQqOTQW2ZNZUp5gqdeU6iISHFQqORQPGb80fx6nlSoiEiRiCxUzCxuZs+a2cNhfYGZPWlmLWb2AzMrDe1lYX1L2D4/7TNuDO0vm9ml0YxkaMsX1PNq62FaD3ZFXYqISM5FOVP5NLApbf0rwC3uvghoB64L7dcB7e6+ELgl9MPMzgCuBs4EVgDfMLN4nmofMZ1XEZFiEkmomNkc4H3At8O6ARcB94UudwFXhOWVYZ2w/eLQfyVwj7t3uftrwBZgeX5GMHKLT6mhoiSu8yoiUhSimql8DfgckAzr04D97t4b1rcDjWG5EXgDIGzvCP2PtWfY5wRmttrMms2subW1NZvjGFZpIsY582p1XkVEikLeQ8XM3g/scfd16c0Zuvow24ba58RG99vdfZm7L2toaBhVvdmwfP40Xtp1gI4jPXn/2SIi+RTFTOUC4ANm9jpwD6nDXl8DatO+EGwOsCMsbweaAML2GqAtvT3DPhPK8gX1uEOzzquIyCSX91Bx9xvdfY67zyd1ov0xd/8w8DhwZei2CnggLD8Y1gnbHwvf6/IgcHW4OmwBsAh4Kk/DGJWlc2spS8T49Za9UZciIpJTE+krgf8GuMfMvgQ8C9wR2u8AvmdmW0jNUK4GcPcNZnYvsBHoBa539778lz288pI4571pGk9szu/5HBGRfIs0VNz9F8AvwvKrZLh6y92PAlcNsv/NwM25qzB73rFoOl/68Sa2t3cyp64y6nJERHJCd9TnyTtPTV0g8MRmHQITkclLoZInC2dUc0pNOY+/vCfqUkREckahkidmxiVnzuKJza10dvcOv4OISAFSqOTRJWfOpKs3yS9f1gl7EZmcFCp5tHx+PXWVJTyyYVfUpYiI5IRCJY8S8RjvOX0ma1/aQ3dvcvgdREQKjEIlz1YsnsXBo7389hVdBSYik49CJc8uWDidKWUJHnp+Z9SliIhknUIlz8pL4lz21ln8bP1OjnRPyAcAiIiMmUIlAh9cOofD3X08ulEn7EVkclGoROC8BfU01lZw/zN/iLoUEZGsUqhEIBYzVi45hV+1tLLn4NGoyxERyRqFSkQ+dM4cko5mKyIyqShUIrJwRjXL59dzz1PbSH09jIhI4VOoROia85p4fV8nv3t1X9SliIhkhUIlQpctnk1NRQl3P/VG1KWIiGSFQiVC5SVxPri0kUfW76LtcHfU5YiIjJtCJWLXLJ9Ld1+S+5/ZHnUpIiLjplCJ2GmzprBsXh3f+/1W+pI6YS8ihU2hMgF84sIF
bN3XyZqNu6MuRURkXBQqE8ClZ86iqb6Cb/3q1ahLEREZF4XKBBCPGZ+4YAHrtrbzzLb2qMsRERkzhcoE8Z+XNVFTUcK/PbYl6lJERMZMoTJBVJUlWP2ON/HYS3tYt7Ut6nJERMZEoTKBfPyC+UyvLuMff/ayHt0iIgVJoTKBVJYm+NRFC3nytTZ+1aKvGxaRwqNQmWCuXt5EY20F//TIyyR134qIFBiFygRTlojz2UtO5cU/dHD/s3osvogUFoXKBHTFkkaWNNXylZ+9xKGu3qjLEREZMYXKBBSLGX//gTNpPdjFvz7WEnU5IiIjplCZoJY01XLluXO489ev8WrroajLEREZEYXKBPa5FadRXhLnhh+9qJP2IlIQ8h4qZtZkZo+b2SYz22Bmnw7t9Wa2xsxawntdaDczu9XMtpjZC2Z2TtpnrQr9W8xsVb7HkmszppTz+fefwVOvt/Gd374edTkiIsOKYqbSC3zW3U8HzgeuN7MzgBuAte6+CFgb1gEuAxaF12rgNkiFEHATcB6wHLipP4gmkyvPncNFb5nBPz7ykg6DiciEl/dQcfed7v5MWD4IbAIagZXAXaHbXcAVYXkl8F1P+T1Qa2azgUuBNe7e5u7twBpgRR6Hkhdmxv/60Fspjcf46/te0HeuiMiEFuk5FTObDywFngRmuvtOSAUPMCN0awTSv8R9e2gbrH3SmTm1nH9YeSbrtrZz2y/0wEkRmbgiCxUzqwZ+BPyVux8YqmuGNh+iPdPPWm1mzWbW3NraOvpiJ4ArljTygbNP4Z/XbOaXmwtzDCIy+UUSKmZWQipQvu/u94fm3eGwFuF9T2jfDjSl7T4H2DFE+0nc/XZ3X+buyxoaGrI3kDwyM778x2/ltJlT+Mu7n+WNts6oSxIROUkUV38ZcAewyd3/JW3Tg0D/FVyrgAfS2q8NV4GdD3SEw2OPAJeYWV04QX9JaJu0KksTfPOj5+LurP7eOo5090VdkojICaKYqVwAfBS4yMyeC6/LgS8D7zWzFuC9YR3gJ8CrwBbgW8AnAdy9Dfgi8HR4fSG0TWrzplXx9WuW8tKuA3z6nmfp7UtGXZKIyDFWbN/bsWzZMm9ubo66jHH7zm9e4+8f2siHljby1avOJhbLdIpJRGT8zGyduy8bSd9ErouR3PjYBQs41NXLVx/dTFVZgi+sPJPUkUURkegoVArY9e9eyMGjvXzziVcpTcT42/edrmARkUgpVAqYmXHDZW+hqzfJHb9+jYNHe/ifH3wribge6SYi0VCoFDgz46b/dAZTK0q4dW0LB4708vVrllCWiEddmogUIf2TdhIwMz7z3lP5u/efwc827OKjdzxF2+HuqMsSkSKkUJlErrtwAV+/egnPvbGfD/zbr9m0c6gHFYiIZJ9CZZJZuaSRH/7Z2+jpS/Khb/yWHzy9jWK7bFxEoqNQmYTObqrlob+4kKVza/mbH73IJ7//DPs7dThMRHJPoTJJzZhazv+97jxuvOwt/HzTbi792hM8/MIOzVpEJKcUKpNYLGb82TvfzL9/8gKmVZXxF//vWT5yx5O07D4YdWkiMkkpVIrA4sYaHvrUhXxx5Zm8uL2Dy77+K27+8UY6jvREXZqITDJ69leR2Xeoi68++jL3PP0G1aUJPn7BfD5x4QJqK0ujLk1EJqjRPPtLoVKkNu08wK1rW/jp+l1UlyX42Nvnc92FC6irUriIyIkUKkNQqJzopV0H+Ne1W/jJ+p1UlsT5yPnz+Mj582iqr4y6NBGZIBQqQ1CoZLZ590FuXdvCT17ciQPvPm0Gf7J8Lu88rYESPUtMpKgpVIagUBnazo4j3P3UG9z91DZaD3ZRV1nC+86azRVLGjlnbp2+t0WkCClUhqBQGZmeviRPbG7lP57bwZqNuzjak6SxtoIVi2dx6ZmzOHdeHXEFjEhRUKgMQaEyeoe6elmzcRcPPb+TX7fspbsvybSqUt79lhlcuHA6b184jRlTyqMuU0RyRKEyBIXK
+Bzq6uUXL+/hkQ27eWJz67F7XU6dWc3b3zydP5pfz7nz6phVo5ARmSwUKkNQqGRPX9LZuOMAv3llL7/ZspenX2/jaE8SgMbaCpbNr+PceXWcM7eO02ZN0Ql/kQKlUBmCQiV3evqSbNxxgOat7azb2kbz6+3sOdgFQGk8xqmzqjlzdg2LG6dy6swpLJxRzbTqsoirFpHhKFSGoFDJH3dne/sRntnWzsYdB9iw4wAbdnTQ3nn88TD1VaUsbKjmzTOqWTijmjdNr6KpvpI5dRWUl+jbK0UmgtGEir5OWHLGzGiqr6SpvpKVSxqBVNDs7DhKy55DtOw+yCuth2jZfYifrt/J/rSwMYNZU8tpqq9kbtprdk05s2sqmDG1TKEjMgEpVCSvzIxTais4pbaCd57acKzd3dl3uJut+w6zra2TbfuOpN7bDvOrllZ2H+g66bPqKkuYObWcWTXlzJpazsyp5cyuKWdmTTkN1WXUVZVSV1lCRUkcM13+LJIPChWZEMyM6dVlTK8u49x59SdtP9rTx/b2TnbsP8quA0fZ3RHeD6Te1//hAPsOd5HpaG5pIkZtRQl1laXUVJZQV1lCbUUptVWpttqKEmorS6mtTK3XVZZQU1lCWUIzIZHRUqhIQSgvibNwxhQWzpgyaJ+eviR7Dnaxq+MorQe76DjSTXtnD+2d3XSE9/bOHl7f20l75372d/bQ3Zcc9PMqS+PHAqeuKhVEUytKqCqNU1mWoLosTmVpgqqyOFWlCarKElSWxqkuS6S2lyaoLIvrqjcpKgoVmTRK4jEaaytorK0YUX9350hPXyp4DnfTceR48HSE9/2dPezv7Ka9s5udHQc4cKSHzu4+Orv7RlxXImaUJWKUJmKUJeKUlcROXE9kWC9JLZeGbf3tx9ZLBqyH7eUlMUrjx39G/2fo6QeSLwoVKVpmRmVpgsrSxIiDqF9fMhVInV29HO7u43BXL4e7euns7uNwdy+dXX0cCm1dvUm6evtS7z2p5e6+/uXU+qGuXrp7w3pP6NubpLs3OeRsaqT6g22wMCpNxCiJx0jEjHjaKxEzYuE9HosRj0EiFju5jx3vG48ZcTPMONYnZqlXPEZ4D22hbzyW+u8RD9uO7Rv6DNzXSPUx49hyLPxMC/9t+5ePt6ftY0ZsYBuD7BM7/pmD7RMzC30o+vN3ChWRMYjHjOqyBNVluf9fKJn04yHU13dCGPUHT6YwOinIegfZr7ePoz1JDnf10udOb5/Tl3T6PPXe2+ck3elNhvbw6k0mjy0ni+vOhGGlh1t/EGGcEEr9QcSAQOxf5li/EwMvPbQGhlv4uFSoHuuUaptWVca9f/62nI9doSIywcViRnksHi6hLom6nIySaSGU9ONB09+edCeZJLV8Qh+nL8kJ60lnwOecuG9f0nFShy9TYea4g5P6nP5lP7ac2n9gmzsk05bd+z/3+GePaJ9hfnb6PviJ/fqXOWH/8LPTxnbiZ6f6939e/89LtXGsHxwfEw5TyvPz171CRUTGLRYzYhi6dUh0WYqIiGRNwYeKma0ws5fNbIuZ3RB1PSIixaygQ8XM4sD/Bi4DzgCuMbMzoq1KRKR4FXSoAMuBLe7+qrt3A/cAKyOuSUSkaBV6qDQCb6Stbw9tIiISgUIPlUx3GZ10xbyZrTazZjNrbm1tzUNZIiLFqdBDZTvQlLY+B9gxsJO73+7uy9x9WUNDw8DNIiKSJYUeKk8Di8xsgZmVAlcDD0Zck4hI0Sr4b340s8uBrwFx4E53v3mY/q3A1jH+uOnA3jHuW6g05uKgMReHsY55nruP6DBPwYdKPplZ80i/UnOy0JiLg8ZcHPIx5kI//CUiIhOIQkVERLJGoTI6t0ddQAQ05uKgMReHnI9Z51RERCRrNFMREZGsUaiMwGR9ErKZ3Wlme8xsfVpbvZmtMbOW8F4X2s3Mbg1/Bi+Y2TnRVT52ZtZkZo+b2SYz22Bmnw7tk3bcZlZuZk+Z2fNhzP8Q2heY2ZNh
zD8I93phZmVhfUvYPj/K+sfDzOJm9qyZPRzWJ/WYzex1M3vRzJ4zs+bQltffbYXKMCb5k5C/A6wY0HYDsNbdFwFrwzqkxr8ovFYDt+WpxmzrBT7r7qcD5wPXh/+ek3ncXcBF7n42sARYYWbnA18BbgljbgeuC/2vA9rdfSFwS+hXqD4NbEpbL4Yxv9vdl6RdOpzf3+3U117qNdgLeBvwSNr6jcCNUdeVxfHNB9anrb8MzA7Ls4GXw/I3gWsy9SvkF/AA8N5iGTdQCTwDnEfqJrhEaD/2ew48ArwtLCdCP4u69jGMdQ6pv0QvAh4m9azAyT7m14HpA9ry+rutmcrwiu1JyDPdfSdAeJ8R2ifdn0M4xLEUeJJJPu5wGOg5YA+wBngF2O/uvaFL+riOjTls7wCm5bfirPga8DkgGdanMfnH7MCjZrbOzFaHtrz+bus76oc3oichF4FJ9edgZtXAj4C/cvcDZpmGl+qaoa3gxu3ufcASM6sF/h04PVO38F7wYzaz9wN73H2dmb2rvzlD10kz5uACd99hZjOANWb20hB9czJmzVSGN6InIU8iu81sNkB43xPaJ82fg5mVkAqU77v7/aF50o8bwN33A78gdT6p1sz6/2GZPq5jYw7ba4C2/FY6bhcAHzCz10l9ed9FpGYuk3nMuPuO8L6H1D8elpPn322FyvCK7UnIDwKrwvIqUucc+tuvDVeMnA909E+pC4mlpiR3AJvc/V/SNk3acZtZQ5ihYGYVwHtInbx+HLgydBs45v4/iyuBxzwcdC8U7n6ju89x9/mk/p99zN0/zCQes5lVmdmU/mXgEmA9+f7djvrEUiG8gMuBzaSOQ/+PqOvJ4rjuBnYCPaT+1XIdqePIa4GW8F4f+hqpq+BeAV4ElkVd/xjHfCGpKf4LwHPhdflkHjdwFvBsGPN64POh/U3AU8AW4IdAWWgvD+tbwvY3RT2GcY7/XcDDk33MYWzPh9eG/r+r8v27rTvqRUQka3T4S0REskahIiIiWaNQERGRrFGoiIhI1ihUREQkaxQqIllmZrVm9smwfIqZ3Rd1TSL5okuKRbIsPFPsYXdfHHEpInmnZ3+JZN+XgTeHBzi2AKe7+2Iz+xhwBRAHFgP/DJQCHyX1ePqBpO9yAAABR0lEQVTL3b3NzN5M6qa0BqAT+FN3f8nMrgJuAvpI3f38jjyPS2RYOvwlkn03AK+4+xLgrwdsWwz8CalnMt0MdLr7UuB3wLWhz+3Ap9z9XOC/A98I7Z8HLvXU96J8ILdDEBkbzVRE8utxdz8IHDSzDuCh0P4icFZ4evLbgR+mPTm5LLz/BviOmd0L3I/IBKRQEcmvrrTlZNp6ktT/jzFS3/mxZOCO7v7nZnYe8D7gOTNb4u77cl2wyGjo8JdI9h0EpoxlR3c/ALwWzp/0f4/42WH5ze7+pLt/ntQ3EzYN8VEikdBMRSTL3H2fmf3GzNZz4vejj9SHgdvM7G+BElLfB/I88E9mtojU02XXhjaRCUWXFIuISNbo8JeIiGSNQkVERLJGoSIiIlmjUBERkaxRqIiISNYoVEREJGsUKiIikjUKFRERyZr/D61Q+17sAsxUAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0xa3ea4e0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Training curve: objective value recorded by train_SVD at every step.\n",
    "fig, ax = plt.subplots()\n",
    "ax.plot(RS.SSEscore)\n",
    "ax.set_xlabel('times')\n",
    "ax.set_ylabel('SSE')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "348.16848693137433"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Objective value recorded at the start of the final training step (last entry of RS.SSEscore).\n",
    "RS.SSEscore[-1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "基于用户和基于活动的协同过滤在训练集上能够较好地拟合标签，但在测试集上预测值全是0.5（程序中打分分为0、0.5、1三种情况）。主要原因分析如下：训练集和测试集中的活动总数为13418，而训练集的样本只有15398个，即平均每个活动只有约1.15个用户；因此对测试集中的样本，能够从训练集中选来做协同分析的样本只有1-2个。且不论它们和训练集的样本是否有交叉部分，即使有，交叉部分也非常少，相似度不一定为正，结果也不可靠。总之，在基于用户和基于活动的协同过滤中，由于样本太稀疏，经常出现样本数为0、分母为0等情况，训练集上效果尚可，测试集上基本无效果。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "基于模型的协同过滤中，SSE收敛于350左右。训练采用的是小批量梯度下降（每步随机抽取约1/100的样本）；最初采用全部样本的梯度下降时出现了梯度爆炸。该方法在训练集和测试集上的效果要好于上面两种方法。"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
